In this document we analyse the results from the Weibull parameter estimation on the METABRIC dataset.
cells <- getCells()
clinical_data <- getClinical()
combinations <- getCombinations()
all_parameters <- getALLParameters()
# Compute means of each combination
means <- all_parameters %>%
group_by(phenotype_from, phenotype_to ) %>%
summarise(mean_shape = mean(shape),
mean_scale = mean(scale))
## `summarise()` has grouped output by 'phenotype_from'. You can override using
## the `.groups` argument.
all_distances_data <- readRDS(file = here('scratch/AUCScaledSlides_300_ALL.rds'))
all_distances_data <- all_distances_data %>% as_tibble %>%
mutate(distance_window = WinMean) %>%
mutate(phenotype_combo = paste(phenotype_from, phenotype_to, sep='_to_')) %>%
dplyr::select(tnumber, phenotype_combo, `N.per.mm2.scaled`, distance_window) %>%
mutate(new = `N.per.mm2.scaled` * 1000) %>%
mutate(new =round(new))
estimated_distances <- all_distances_data %>%
unite('unique_sample', c(tnumber, phenotype_combo), remove=F) %>% filter(unique_sample %in% all_parameters$unique_sample)
cell_occurences <- getCellCounts()
Prior to parameter estimation, parameters are initialized on data on the [0,100] interval using the ‘fitdist’ package. This is a quick and sloppy estimation.
Samples can be divided into four categories: 1. No initialization and no estimation possible (combination does not occur) 2. No initialization, but estimation (distances outside [0,100] interval or initialization exceeded time) 3. Initialization, but no estimation (Not possible to fit Weibull distribution) 4. No initialization and no estimation, although combination is present.
This last category requires more analyses to find out why the estimation failed.
# initial_parameters <- tibble()
# for (c in combinations){
# init_params <- readRDS(here(paste('scratch/init_parameters/init_params_', c,'.rds', sep='')))
# initial_parameters <- bind_rows(initial_parameters, init_params)
# }
#
# initial_parameters <- merge(x= initial_parameters %>%
# filter(term=='shape') %>%
# select(c(tnumber,combo, estimate))%>%
# rename(shape = estimate),
# y= initial_parameters %>%
# filter(term=='scale') %>%
# select(c(tnumber,combo, estimate)) %>%
# rename(scale = estimate),
# by.x=c('tnumber', 'combo'),
# by.y= c('tnumber', 'combo')) %>%
# unite('unique_sample', c(tnumber, combo), remove=FALSE) %>%
# separate(combo, into=c('phenotype_from','phenotype_to'),sep='_to_', remove=FALSE) %>%
# rename(phenotype_combo = combo)
# saveRDS(initial_parameters, file=here('scratch/initial_parameters.rds'))
initial_parameters <- read_rds(here('scratch/initial_parameters.rds'))
not_estimated_samples <- initial_parameters %>% filter(! unique_sample %in% all_parameters$unique_sample)
not_initialized_samples <- all_parameters %>% filter(unique_sample %in% setdiff(all_parameters$unique_sample, intersect(initial_parameters$unique_sample, all_parameters$unique_sample)))
cell_occurences_combinations <- getCombinationCounts()
all_possible_samples <- cell_occurences_combinations %>% filter(n_to > 0 & n_from > 0)
not_estimated_and_initialized_samples <- cell_occurences_combinations %>% filter(! unique_sample %in% all_parameters$unique_sample) %>% filter(! unique_sample %in% initial_parameters$unique_sample) %>% filter(n_to > 0 & n_from > 0)
print(paste('Number of samples possible to estimate (combination exists):', nrow(all_possible_samples)))
## [1] "Number of samples possible to estimate (combination exists): 314761"
print(paste('Total number of samples from which the parameters are initialized:', nrow(initial_parameters), '(', round(nrow(initial_parameters)*100/nrow(all_possible_samples),2), '%)'))
## [1] "Total number of samples from which the parameters are initialized: 264056 ( 83.89 %)"
print(paste('Total number of samples from which the parameters are estimated:', nrow(all_parameters), '(', round(nrow(all_parameters)*100/nrow(all_possible_samples),2), '%)'))
## [1] "Total number of samples from which the parameters are estimated: 88856 ( 28.23 %)"
print(paste('Number of samples from which the parameters are not estimated and not initialized:', nrow(not_estimated_and_initialized_samples),'(', round(nrow(not_estimated_and_initialized_samples)*100/nrow(all_possible_samples),2), '%)'))
## [1] "Number of samples from which the parameters are not estimated and not initialized: 48295 ( 15.34 %)"
result = c()
for (i in 1:100){
more <- (all_possible_samples %>% filter(n_from > i & n_to > i))
percentage <- nrow(more[(more$unique_sample %in% all_parameters$unique_sample),])/nrow(more)
result = c(result,percentage)
}
ggplot(data =tibble(n_cells = 1:100, estimated = result*100)) + geom_point(aes(x=n_cells, y=estimated)) + theme_bw() + ylab('percentage estimated') + xlab('n cells of both types in combination') + ylim(0,100) + ggtitle('cumulative distribution function of estimation count')
ggsave(here('output/Distance_results/Missing_estimations_analysis/estimated_samples_percentages.pdf'),width=7,height=5)
How many combinations have a valid number of estimations?
result <- data.frame(matrix(nrow=0, ncol=2))
for (i in seq(0,95,5)){
int <- c(i, nrow(all_parameters %>% count(phenotype_combo) %>% filter(n > i)) )
result <- rbind(result, int)
}
result <- setNames(result, c('percentage', 'features'))
ggplot(data=result) + geom_point(aes(x=percentage, y=features)) + theme_bw() + xlab('estimation percentage') + ylab('n features')
ggsave(here('output/Distance_results/Missing_estimations_analysis/estimated_samples_filtering_percentages.pdf'),width=7,height=5)
Which combinations could not be estimated at all?
not_estimated_combinations <- setdiff(combinations, unique(all_parameters$phenotype_combo))
print(str_sort(not_estimated_combinations))
## [1] "CD4^{+} T cells & APCs_to_CD57^{+}"
## [2] "CD57^{+}_to_CK^{+} CXCL12^{+}"
## [3] "CD57^{+}_to_CK8-18^{+} ER^{hi}"
## [4] "CD57^{+}_to_CK8-18^{hi}CXCL12^{hi}"
## [5] "CD57^{+}_to_Ep Ki67^{+}"
## [6] "CD57^{+}_to_Fibroblasts FSP1^{+}"
## [7] "CD57^{+}_to_Ki67^{+}"
## [8] "CD57^{+}_to_MHC^{hi}CD15^{+}"
## [9] "CK^{lo}ER^{med}_to_Granulocytes"
## [10] "CK8-18^{+} ER^{hi}_to_CD57^{+}"
## [11] "CK8-18^{hi}CXCL12^{hi}_to_CD57^{+}"
## [12] "CK8-18^{hi}ER^{lo}_to_Ki67^{+}"
## [13] "Ep CD57^{+}_to_Macrophages & granulocytes"
## [14] "Fibroblasts FSP1^{+}_to_CD4^{+} T cells"
## [15] "Fibroblasts FSP1^{+}_to_CD57^{+}"
## [16] "Fibroblasts FSP1^{+}_to_Macrophages & granulocytes"
## [17] "Fibroblasts FSP1^{+}_to_MHC I & II^{hi}"
## [18] "Granulocytes_to_CD57^{+}"
## [19] "Granulocytes_to_CK^{lo}ER^{med}"
## [20] "HER2^{+}_to_MHC I^{hi}CD57^{+}"
## [21] "Ki67^{+}_to_Basal"
## [22] "Ki67^{+}_to_CD57^{+}"
## [23] "Macrophages_to_CD57^{+}"
## [24] "MHC I^{hi}CD57^{+}_to_Ep Ki67^{+}"
## [25] "MHC I^{hi}CD57^{+}_to_Granulocytes"
## [26] "MHC I^{hi}CD57^{+}_to_HER2^{+}"
## [27] "MHC^{hi}CD15^{+}_to_B cells"
## [28] "MHC^{hi}CD15^{+}_to_CD57^{+}"
## [29] "MHC^{hi}CD15^{+}_to_CK^{+} CXCL12^{+}"
## [30] "MHC^{hi}CD15^{+}_to_Fibroblasts FSP1^{+}"
Look at the slides of the samples that were not estimated, but initialized.
sample_points_notest <- sample_n(not_estimated_samples, 5)
for (row in 1:nrow(sample_points_notest)){
print(show_distance_distribution(sample_points_notest[row,],0.1))
}
## Warning: Removed 4 rows containing missing values (`position_stack()`).
if (params$save_files){
ggsave(here('output/Distance_results/Missing_estimations_analysis/not_estimated_samples.pdf'), plot = plot_grid(show_distance_distribution(sample_points_notest[1,]),show_distance_distribution(sample_points_notest[2,]),show_distance_distribution(sample_points_notest[3,]),
show_distance_distribution(sample_points_notest[4,]),show_distance_distribution(sample_points_notest[5,]), ncol=1), width= 10, height=20 )
}
## Warning: Removed 4 rows containing missing values (`position_stack()`).
## Removed 4 rows containing missing values (`position_stack()`).
## Warning: Removed 14 rows containing missing values (`position_stack()`).
Look at the slides of the samples that were not initialized, but estimated.
sample_points_notinit <- sample_n(not_initialized_samples, 5)
for (row in 1:nrow(sample_points_notinit)){
print(show_distance_distribution(sample_points_notinit[row,],0.05))
}
if (params$save_files){
ggsave(here('output/Distance_results/Missing_estimations_analysis/not_init_samples.pdf'), plot = plot_grid(show_distance_distribution(sample_points_notinit[1,]),show_distance_distribution(sample_points_notinit[2,]),show_distance_distribution(sample_points_notinit[3,]),
show_distance_distribution(sample_points_notinit[4,]),show_distance_distribution(sample_points_notinit[5,]), ncol=1), width= 10, height=20 )
}
## Warning: Removed 3 rows containing missing values (`position_stack()`).
## Warning: Removed 1 rows containing missing values (`position_stack()`).
## Warning: Removed 4 rows containing missing values (`position_stack()`).
## Warning: Removed 20 rows containing missing values (`position_stack()`).
Look at the slides of the samples that were not initialized and not estimated.
show_distance_distribution_without_estimate <- function(sample){
sample_distance_data <- all_distances_data %>% filter(tnumber %in% sample$tnumber) %>% filter(phenotype_combo %in%
sample$phenotype_combo)
dist <- ggplot(data= sample_distance_data %>% filter(tnumber == sample['tnumber'][[1]]) %>% filter(phenotype_combo == sample['phenotype_combo'][[1]])) + geom_bar(aes(x=distance_window, y=N.per.mm2.scaled), stat="identity",alpha=0.5) +
xlim(0,300) + ylim(0,0.2) + theme_bw() + theme(legend.position = "none") + ggtitle(label = sample['phenotype_combo'][[1]])
slide <- ggplot(data=cells%>% filter(ImageNumber == sample['tnumber'][[1]]) %>%
filter(meta_description == sample['phenotype_from'][[1]] | meta_description == sample['phenotype_to'][[1]])) +
geom_point(aes(x=Location_Center_X, y=Location_Center_Y, color=meta_description), alpha =0.5) + xlim(0,1300) + ylim(0,1300) +
theme_bw() + ggtitle(paste('imagenumber: ', sample['tnumber'][[1]]))
return(plot_grid(dist, slide))
}
sample_points_notinit_and_notest <- sample_n(not_estimated_and_initialized_samples, 5)
for (row in 1:nrow(sample_points_notinit_and_notest)){
print(show_distance_distribution_without_estimate(sample_points_notinit_and_notest[row,]))
}
## Warning: Removed 4 rows containing missing values (`position_stack()`).
## Warning: Removed 4 rows containing missing values (`position_stack()`).
if (params$save_files){
ggsave(here('output/Distance_results/Missing_estimations_analysis/not_init_and_estimated_samples.pdf'), plot = plot_grid(show_distance_distribution_without_estimate(sample_points_notinit_and_notest[1,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[2,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[3,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[4,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[5,]), ncol=1), width= 10, height=20 )
}
## Warning: Removed 4 rows containing missing values (`position_stack()`).
## Removed 4 rows containing missing values (`position_stack()`).
We would like to find out why our method fails in initializing and estimating certain samples. A possible explanation is that the combination exist, but is very rare.
Zijn de samples met hoge counts in cell types met veel 0 counts, verrijkt in bepaaplde subtypes of iets dergelijks?
all_possibilities_counts <- all_possible_samples %>% count(phenotype_combo) %>%
separate(phenotype_combo, into=c('phenotype_from','phenotype_to'),sep='_to_', remove = FALSE)
all_possibilities_matrix <- reshape2::dcast(all_possibilities_counts, phenotype_from~phenotype_to)
## Using n as value column: use value.var to override.
rownames(all_possibilities_matrix) <- all_possibilities_matrix$phenotype_from
all_possibilities_matrix <- all_possibilities_matrix %>% select(-c('phenotype_from')) %>% mutate_all( ~coalesce(.,0))
hm1 <- Heatmap((all_possibilities_matrix / length(unique(cells$ImageNumber)))*100,
name = '% samples', column_title = 'phenotype_to', row_title = 'phenotype_from',
row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6),
cluster_rows = T, cluster_columns = T,show_row_dend =F)
## Warning: The input is a data frame-like object, convert it to a matrix.
if (params$save_files){
save_pdf(hm1, here('output/Distance_results/Missing_estimations_analysis/allpossibleHM.pdf'), width=10, height=6)
}
hc = hclust(dist(t((all_possibilities_matrix / length(unique(cells$ImageNumber)))*100), method="euclidean"), method = "complete")
all_parameters_counts <- all_parameters %>% count(phenotype_combo) %>%
separate(phenotype_combo, into=c('phenotype_from','phenotype_to'),sep='_to_', remove = FALSE)
all_parameters_matrix <- reshape2::dcast(all_parameters_counts, phenotype_from~phenotype_to)
## Using n as value column: use value.var to override.
rownames(all_parameters_matrix) <- all_parameters_matrix$phenotype_from
all_parameters_matrix <- all_parameters_matrix %>% select(-c('phenotype_from')) %>% mutate_all( ~coalesce(.,0))
all_parameters_matrix <- (all_parameters_matrix / all_possibilities_matrix)*100
hm2 <- Heatmap(all_parameters_matrix,
name = '% estimated', column_title = 'phenotype_to', row_title = 'phenotype_from',
row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6),
cluster_rows = T, cluster_columns = hc, show_heatmap_legend = F)
## Warning: The input is a data frame-like object, convert it to a matrix.
if (params$save_files){
save_pdf(hm2, here('output/Distance_results/Missing_estimations_analysis/estimatedHM.pdf'), width=10, height=6)
}
not_estimated_counts <- not_estimated_samples %>% count(phenotype_combo) %>%
separate(phenotype_combo, into=c('phenotype_from','phenotype_to'),sep='_to_', remove = FALSE)
not_estimated_matrix <- reshape2::dcast(not_estimated_counts, phenotype_from~phenotype_to)
## Using n as value column: use value.var to override.
rownames(not_estimated_matrix) <- not_estimated_matrix$phenotype_from
not_estimated_matrix <- not_estimated_matrix %>% select(-c('phenotype_from')) %>% mutate_all( ~coalesce(.,0))
not_estimated_matrix <- (not_estimated_matrix / all_possibilities_matrix)*100
hm3 <- Heatmap(not_estimated_matrix,
name = '% unestimated samples', column_title = 'phenotype_to', row_title = 'phenotype_from',
row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6),
cluster_rows = T, cluster_columns = hc,show_heatmap_legend = F)
## Warning: The input is a data frame-like object, convert it to a matrix.
if (params$save_files){
save_pdf(hm3, here('output/Distance_results/Missing_estimations_analysis/notEstimatedHM.pdf'), width=10, height=6)
}
hm_list <- hm1 + hm2 + hm3
draw(hm_list)
medians_from <- not_estimated_and_initialized_samples %>% group_by(phenotype_from) %>% summarise(median = median(n_from))
medians_to <- not_estimated_and_initialized_samples %>% group_by(phenotype_to) %>% summarise(median = median(n_to))
t <- reshape2::melt(not_estimated_and_initialized_samples, id = c('unique_sample', 'tnumber', 'phenotype_combo', 'phenotype_from', 'phenotype_to')) %>% mutate(type = ifelse(variable == 'n_from', phenotype_from, phenotype_to))
ggplot(data=reshape2::melt(not_estimated_and_initialized_samples, id = c('unique_sample', 'tnumber', 'phenotype_combo', 'phenotype_from', 'phenotype_to')) %>% mutate(type = ifelse(variable == 'n_from', phenotype_from, phenotype_to))) + geom_boxplot(aes(x=type, y=value, fill = variable)) + theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + geom_text(data=medians_from, aes(x=phenotype_from,y=median, label=median),
size = 3, vjust = -15,color='red') + ggtitle('cell type counts of unestimated combinations with medians in red') + xlab('phenotype') + ylab('n')
if (params$save_files){
ggsave(here('output/Distance_results/Missing_estimations_analysis/notestimatedcounts_boxplot1.pdf'),width=10, height=5)
}
There are still samples with high counts, but without estimation.
print(paste('Number of unestimated samples with phenotype_to or phenotype_from > 1000:', nrow(not_estimated_and_initialized_samples %>% filter(n_from > 1000 | n_to > 1000))))
## [1] "Number of unestimated samples with phenotype_to or phenotype_from > 1000: 152"
sample_points_notinit_and_notest <- sample_n(not_estimated_and_initialized_samples %>% filter(n_from > 1000 | n_to > 1000),5)
for (row in 1:nrow(sample_points_notinit_and_notest)){
print(show_distance_distribution_without_estimate(sample_points_notinit_and_notest[row,]))
}
## Warning: Removed 4 rows containing missing values (`position_stack()`).
if (params$save_files){
ggsave(here('output/Distance_results/Missing_estimations_analysis/one_count_high_failed_samples.pdf'), plot = plot_grid(show_distance_distribution_without_estimate(sample_points_notinit_and_notest[1,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[2,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[3,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[4,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[5,]), ncol=1), width= 10, height=20 )
}
## Warning: Removed 4 rows containing missing values (`position_stack()`).
There are also samples with high counts of both from and to cell type.
print(paste('Number of unestimated samples with both phenotype_to and from count 50+ :', nrow(not_estimated_and_initialized_samples %>% filter(n_from > 50 & n_to > 50))))
## [1] "Number of unestimated samples with both phenotype_to and from count 50+ : 8"
sample_points_notinit_and_notest <- not_estimated_and_initialized_samples %>% filter(n_from > 50 & n_to > 50)
for (row in 1:nrow(sample_points_notinit_and_notest)){
print(show_distance_distribution_without_estimate(sample_points_notinit_and_notest[row,]))
}
if (params$save_files){
ggsave(here('output/Distance_results/Missing_estimations_analysis/both_count_high_failed_samples1.pdf'), plot = plot_grid(show_distance_distribution_without_estimate(sample_points_notinit_and_notest[1,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[2,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[3,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[4,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[5,]), ncol=1), width= 10, height=20)
ggsave(here('output/Distance_results/Missing_estimations_analysis/both_count_high_failed_samples2.pdf'), plot = plot_grid(
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[6,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[7,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[8,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[9,]),
show_distance_distribution_without_estimate(sample_points_notinit_and_notest[10,]), ncol=1), width= 10, height=20 )
}
Are the distributions of unestimated samples with high counts very different from the other distributions?
compare_samples <- function(highlight_tnumber, c){
dist <- bind_rows(estimated_distances %>% filter(phenotype_combo == c) %>% group_by(distance_window)%>% dplyr::summarize(N.per.mm2.scaled = mean(N.per.mm2.scaled)) %>%
mutate(plot = ' other estimates'), all_distances_data %>% filter((tnumber == highlight_tnumber & phenotype_combo == c)) %>% mutate(plot = 'not estimated distributions')) %>%
ggplot(aes(x=distance_window, y=N.per.mm2.scaled, fill=plot)) +
geom_bar(stat="identity", position = position_identity(), alpha=0.5) + theme_bw() +
ggtitle(paste(c, '(', length(unique(estimated_distances %>% filter(phenotype_combo == c) %>% pull(tnumber))), 'estimates)'))
return(dist)
}
sample_points_notinit_and_notest <- not_estimated_and_initialized_samples %>% filter(n_from > 50 & n_to > 50)
for (row in 1:nrow(sample_points_notinit_and_notest)){
print(compare_samples(sample_points_notinit_and_notest[row,]['tnumber'][[1]],sample_points_notinit_and_notest[row,]['phenotype_combo'][[1]] ))
}
Are specific low counts correlated with cancer subtype?
column_order <- function(dataset, occ_number, subtype){
cell_occurences_categories <- dataset %>% mutate(category = ifelse(n_to > occ_number & n_from> occ_number, paste(as.character(occ_number),'+',sep=''), as.character(occ_number))) %>%
group_by(phenotype_from, phenotype_to) %>%
count(category) %>%
mutate(n = n/ length(unique(dataset %>% pull(tnumber))))
df <- cell_occurences_categories %>% filter(category == paste(as.character(occ_number),'+',sep=''))
trans_df <- xtabs(n ~ phenotype_from + phenotype_to, df)
hc = hclust(dist(t(trans_df), method="euclidean"), method = "complete")
return(hc)
}
heatmap_occurences <- function(dataset, occ_number, subtype, cluster_columns=T,show_heatmap_legend = T){
cell_occurences_categories <- dataset %>% mutate(category = ifelse(n_to > occ_number & n_from> occ_number, paste(as.character(occ_number),'+',sep=''), as.character(occ_number))) %>%
group_by(phenotype_from, phenotype_to) %>%
count(category) %>%
mutate(n = n/ length(unique(dataset %>% pull(tnumber))))
df <- cell_occurences_categories %>% filter(category == paste(as.character(occ_number),'+',sep=''))
trans_df <- xtabs(n ~ phenotype_from + phenotype_to, df)
hm <- Heatmap(trans_df, cluster_rows = cluster_columns, cluster_columns = cluster_columns, row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6),
name = paste('sample percentage \n of type', subtype, '\n with', paste(as.character(occ_number),'+',sep=''), 'occ.'), column_title = 'phenotype_to', row_title = 'phenotype_from', show_heatmap_legend = show_heatmap_legend)
return(hm)
}
hc = column_order(cell_occurences_combinations, 0,'ALL')
hm_all <- heatmap_occurences(cell_occurences_combinations, 0,'ALL', cluster_columns = hc)
print(hm_all)
subtypes = group_split(clinical_data %>% group_by(PAM50))
for (s in 1:length(subtypes)){
subtype = subtypes[[s]]
hm <- heatmap_occurences(cell_occurences_combinations %>% filter(tnumber %in% subtype$ImageNumber), 0,unique(subtype$PAM50), cluster_columns = hc, show_heatmap_legend = T)
print(hm)
}
get_combination_occurences <- function(){
parameters_with_subtypes <- merge(all_parameters, clinical_data %>% select(c("ImageNumber", "PAM50")), by.x='tnumber', by.y='ImageNumber')
combination_occurences <- (parameters_with_subtypes %>% group_by(`PAM50`, phenotype_combo) %>% summarise(n = n())) %>% separate(phenotype_combo, into=c('phenotype_from','phenotype_to'),sep='_to_', remove = FALSE)
subtype_count <- clinical_data %>% group_by(`PAM50`) %>% summarise(n_proportion = n()/794)
combination_occurences <- merge(merge(combination_occurences, combination_occurences %>% group_by(phenotype_combo) %>% summarise(n_total = sum(n)), by='phenotype_combo'), subtype_count, by.x='PAM50', by.y='PAM50') %>% mutate(expected_n = (n_proportion) * n_total)
return(combination_occurences)
}
combination_occurences <- get_combination_occurences()
## `summarise()` has grouped output by 'PAM50'. You can override using the
## `.groups` argument.
lvls <- combination_occurences %>% group_by(phenotype_from) %>% summarise(n=n()) %>% arrange(n) %>% pull(phenotype_from)
ggplot(data=combination_occurences) + geom_bar(aes(x=factor(phenotype_from, levels=lvls), y=n, fill=PAM50), stat="identity",position='fill') + theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=7)) + facet_wrap(vars(phenotype_to), ncol = 8) + xlab('phenotype_from')
if (params$save_files){
ggsave(here('output/Distance_results/Missing_estimations_analysis/estimation_counts.pdf'), height=20, width=30)
}
ggplot(data=combination_occurences %>% filter(n_total > (794*0.5))) + geom_bar(aes(x=factor(phenotype_from, levels=lvls), y=n, fill=PAM50), stat="identity",position='fill') + theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=7)) + facet_wrap(vars(phenotype_to), ncol = 8) + xlab('phenotype_from')
if (params$save_files){
ggsave(here('output/Distance_results/Missing_estimations_analysis/estimation_counts_50percent.pdf'), height=20, width=30)
}
dominant_estimations <- unique(combination_occurences %>% filter(n_total > (794*0.5)) %>% filter(n > (expected_n*2) | n < (expected_n*(1/2)))%>% pull(phenotype_combo))
ggplot(data=combination_occurences %>% filter(phenotype_combo %in% dominant_estimations)) + geom_bar(aes(x=factor(phenotype_from, levels=lvls), y=n, fill=PAM50), stat="identity",position='fill') + theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size=7)) + facet_wrap(vars(phenotype_to), ncol = 8) + xlab('phenotype_from')
if (params$save_files){
ggsave(here('output/Distance_results/Missing_estimations_analysis/dominant_estimation_counts.pdf'), height=20, width=30)
}
Filter out the outliers.
# distances <- tibble(distance_window = c(1:600))
# unique_samples <- unique(estimated_distances %>% pull(unique_sample))
# # unique_samples <- c('134_CK^{lo}ER^{med}_to_ER^{hi}CXCL12^{+}')
# estimated_distances <- estimated_distances %>% select(-c(tnumber, phenotype_combo))
#
# library(doMC)
#
# cores = detectCores()
# registerDoMC(cores[1])
# RMSE <- do.call(rbind, mclapply(unique_samples, function(s){
# distance_combinations <- tidyr::crossing(s, distances) %>% rename(unique_sample = 1)
# estimated_distances_sample <- merge(distance_combinations, estimated_distances %>%
# filter(unique_sample == s), by = c('unique_sample', 'distance_window'), all.x=T) %>% replace_na(list(N.per.mm2.scaled = 0))
# estimated_distances_sample <- merge(estimated_distances_sample, all_parameters %>% select(c(unique_sample, shape,scale)), by='unique_sample',all.x = T)
# estimated_distances_sample <- estimated_distances_sample %>% mutate(predicted = dweibull(x=distance_window, shape=shape, scale=scale))
# estimated_distances_sample <- estimated_distances_sample %>% mutate(residual = (predicted - N.per.mm2.scaled))
# estimated_distances_sample <- estimated_distances_sample %>% mutate(sqt_residuals = residual^2)
#
# RMSE_sample <- estimated_distances_sample %>% group_by(unique_sample) %>%
# summarise(length_vector = c_across(cols = c(sqt_residuals)) %>% length(),
# sum_vector = c_across(cols = c(sqt_residuals)) %>% sum(),
# mean_error = sum_vector/length_vector,
# MSE = mean_error %>% sqrt())
# return(RMSE_sample)
# },mc.cores = 24))
# NOT PARALLELLIZED METHOD
# RMSE <- data.frame(matrix(nrow = 0, ncol = 5))
# for (s in unique_samples){
# distance_combinations <- tidyr::crossing(s, distances) %>% rename(unique_sample = 1)
# estimated_distances_sample <- merge(distance_combinations, estimated_distances %>% filter(unique_sample == s), by = c('unique_sample', 'distance_window'), all.x=T) %>% replace_na(list(N.per.mm2.scaled = 0))
# estimated_distances_sample <- merge(estimated_distances_sample, all_parameters %>% select(c(unique_sample, shape,scale)), by='unique_sample',all.x = T)
# estimated_distances_sample <- estimated_distances_sample %>% mutate(predicted = dweibull(x=distance_window, shape=shape, scale=scale))
# estimated_distances_sample <- estimated_distances_sample %>% mutate(residual = (predicted - N.per.mm2.scaled))
# estimated_distances_sample <- estimated_distances_sample %>% mutate(sqt_residuals = residual^2)
#
# RMSE_sample <- estimated_distances_sample %>% group_by(unique_sample) %>%
# summarise(length_vector = c_across(cols = c(sqt_residuals)) %>% length(),
# sum_vector = c_across(cols = c(sqt_residuals)) %>% sum(),
# mean_error = sum_vector/length_vector,
# MSE = mean_error %>% sqrt())
#
# RMSE <- rbind(RMSE, RMSE_sample)
# }
# saveRDS(RMSE, file=here('scratch/RMSE.rds'))
# Compute predicted and observed median
# total_observations <- estimated_distances %>% group_by(unique_sample) %>%
# summarise(total_observations= c_across(cols = c(N.per.mm2.scaled)) %>% sum())
#
# medians <- merge(estimated_distances, total_observations, by='unique_sample', all.x=T)
# medians <- medians %>% mutate(frequency = round((N.per.mm2.scaled / total_observations)*10000))
# medians <- medians %>% group_by(unique_sample) %>% summarise(
# median_observations = median(rep(distance_window,frequency))
# )
#
# medians <- merge(medians, all_parameters %>% select(c(unique_sample, shape,scale)), by='unique_sample',all.y = T)
# medians <- medians %>% mutate(median_weibull = scale * (log(2))^(1/shape))
# medians <- medians %>% mutate(difference = abs(median_weibull - median_observations))
# saveRDS(medians, file=here('scratch/medians.rds'))
RMSE <- read_rds(here('scratch/RMSE.rds'))
quantiles <- as_tibble(quantile(RMSE$MSE,probs=c(seq(0,1,0.1),0.95,0.99,0.999))) %>% mutate(quantiles=paste(100* c(seq(0,1,0.1),0.95,0.99,0.999), '%'))
ggplot() + geom_point(data=RMSE,aes(x=MSE, y=1),alpha=0.05,size=5) + geom_point(data=quantiles, aes(x=value,y=1,color=quantiles),size=7,shape=3) +
scale_color_discrete(breaks=paste(100* c(seq(0,0.9,0.1),0.95,0.99,0.999,1), '%'))+ theme_bw() + xlab('RMSE')
ggplot() + geom_histogram(data=RMSE, aes(x=MSE)) + scale_y_log10() + geom_vline(data=quantiles %>% filter(quantiles %in% c('90 %','95 %','99 %','99.9 %')), aes(xintercept=value,color=quantiles),linetype="dashed") +
scale_color_discrete(breaks=paste(100* c(seq(0,0.9,0.1),0.95,0.99,0.999,1), '%')) +theme_bw() + xlab('RMSE')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 14 rows containing missing values (`geom_bar()`).
threshold <- quantiles %>% filter(quantiles == '99 %') %>% pull(value)
outliers <- RMSE %>% filter(MSE > threshold) %>% pull(unique_sample)
outlier_points <- all_parameters %>% filter((unique_sample %in% outliers))
medians <- read_rds(here('scratch/medians.rds'))
quantiles <- as_tibble(quantile(medians$difference,probs=c(seq(0,1,0.1),0.95,0.99,0.999))) %>% mutate(quantiles=paste(100*c(seq(0,1,0.1),0.95,0.99,0.999), '%'))
ggplot() + geom_point(data=medians,aes(x=difference, y=1),alpha=0.05,size=5) + geom_point(data=quantiles, aes(x=value,y=1,color=quantiles),size=7,shape=3) +
scale_color_discrete(breaks=paste(100* c(seq(0,0.9,0.1),0.95,0.99,0.999,1), '%')) + theme_bw()
ggplot() + geom_histogram(data=medians, aes(x=difference)) + scale_y_log10() + geom_vline(data=quantiles %>% filter(quantiles %in% c('90 %','95 %','99 %','99.9 %')), aes(xintercept=value,color=quantiles),linetype="dashed") +
scale_color_discrete(breaks=paste(100* c(seq(0,0.9,0.1),0.95,0.99,0.999,1), '%')) + theme_bw() + xlab('delta median')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
threshold <- quantiles %>% filter(quantiles == '95 %') %>% pull(value)
outliers <- medians %>% filter(difference > 80) %>% pull(unique_sample)
outlier_points <- all_parameters %>% filter((unique_sample %in% outliers))
saveRDS(outlier_points, here('scratch/outlier_parameters.rds'))
Do the parameters look like a C-curve?
all_parameters <- getALLParameters()
outliers <- read_rds(here('scratch/outlier_parameters.rds')) %>% pull(unique_sample)
ggplot() +
geom_point(data = all_parameters %>% filter((unique_sample %in% outliers)), aes(x=shape, y=scale,color='outlier'), alpha=0.1, size=0.5) +
geom_point(data = all_parameters %>% filter(!(unique_sample %in% outliers)), aes(x=shape, y=scale), color='black',alpha=0.1, size=0.5) +
geom_density_2d(data = all_parameters, aes(x=shape, y=scale)) +
theme_bw() +
ylim(10,300) + xlim(0,6) +
scale_y_log10() +
xlab('Shape') + ylab('Scale') + ggtitle("All parameters with outliers") +
guides(colour = guide_legend(override.aes = list(size=10)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 271 rows containing non-finite values (`stat_density2d()`).
## Warning: Removed 9 rows containing missing values (`geom_point()`).
## Warning: Removed 262 rows containing missing values (`geom_point()`).
ggsave(paste(here('output/Distance_results/Alberto_reproduction/'), 'parameters_with_outliers_median.pdf', sep=''),width=6,height=4)
## Warning: Removed 271 rows containing non-finite values (`stat_density2d()`).
## Warning: Removed 9 rows containing missing values (`geom_point()`).
## Warning: Removed 262 rows containing missing values (`geom_point()`).
all_parameters_withMedianDifferences <- merge(all_parameters, medians %>% select(c(unique_sample, difference)), by='unique_sample', all.x=T)
ggplot() +
geom_point(data = all_parameters_withMedianDifferences, aes(x=shape, y=scale,colour=difference),size=0.5, alpha=1) +
scale_colour_continuous() +
scale_colour_gradient2(midpoint=25, mid='white',low="black", high="red", space ="Lab", name='delta median' ) +
theme_bw() +
ylim(10,300) + xlim(0,6) +
scale_y_log10() +
xlab('Shape') + ylab('Scale') + ggtitle("Parameters delta median difference")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 271 rows containing missing values (`geom_point()`).
ggsave(paste(here('output/Distance_results/Alberto_reproduction/'), 'parameters_withdeltamedian.pdf', sep=''),width=6,heigh=4)
## Warning: Removed 271 rows containing missing values (`geom_point()`).
Alberto_parameters <- read_tsv(here('DATA/global_weib_coeff_oct.tsv')) %>% mutate(type='NABUCCO')
## Rows: 1152 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): sample, phenotype_combo, translational_response, Biopsy, PA_Respons...
## dbl (5): a, b, A, B, ptID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot() +
geom_point(data = all_parameters %>% filter(!(unique_sample %in% outliers)) %>% mutate(type='METABRIC'), aes(x=shape, y=scale, color=type), alpha=0.1, size=0.5) +
geom_point(data = Alberto_parameters, aes(x=a, y=b, color=type),alpha = 0.5, size=0.5) +
theme_bw() +scale_color_manual(values=c("black", "red")) +
labs(color=NULL) +
ylim(10,300) + xlim(0,6) +
scale_y_log10() +
xlab('Shape') + ylab('Scale') + ggtitle("METABRIC and NABCUCCO parameters") +
guides(colour = guide_legend(override.aes = list(size=10)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 262 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
ggsave(paste(here('output/Distance_results/Alberto_reproduction/'), 'allparameters_withNABUCCO.pdf', sep=''))
## Saving 7 x 5 in image
## Warning: Removed 262 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
Look at the extremes of the curve.
all_parameters_withoutOutliers <- all_parameters %>% filter(!(unique_sample %in% outliers))
# Pick extremes
# Divide the curve in areas and pick random from these areas
bottom_right <- sample_n(all_parameters_withoutOutliers %>% filter(shape > 5) %>% filter(scale < 20),1) %>% mutate(type= "low scale, high shape")
bottom_left <- sample_n(all_parameters_withoutOutliers %>% filter(shape < 2.5) %>% filter(shape >2) %>% filter(scale < 15),1)%>% mutate(type= "low scale, low shape")
mid_right <- sample_n(all_parameters_withoutOutliers %>% filter(shape > 2.8) %>% filter(shape <3.5) %>% filter(scale > 200) %>%
filter(scale < 300) ,1)%>% mutate(type= "mid scale, high shape")
mid_left <- sample_n(all_parameters_withoutOutliers %>% filter(shape < 1) %>% filter(scale > 100)%>%
filter(scale < 120),1 )%>% mutate(type= "mid scale, low shape")
# top_right <- sample_n(all_parameters %>% filter(shape > 3.5) %>% filter(scale > 350) ,1)%>% mutate(type= "high scale, high shape")
# top_left <- sample_n(all_parameters %>% filter(shape < 2) %>% filter(scale > 300),1 )%>% mutate(type= "high scale, low shape")
belly <- sample_n(all_parameters_withoutOutliers %>% filter(shape < 2.8) %>% filter(shape > 2.6) %>% filter(scale > 80) %>% filter(scale <90),1)%>% mutate(type= "belly")
sample_points <- rbind(bottom_right, bottom_left,mid_right, mid_left, belly)
print(sample_points$phenotype_combo)
## [1] "ER^{hi}CXCL12^{+}_to_CK8-18^{hi}CXCL12^{hi}"
## [2] "CD38^{+} lymphocytes_to_CD38^{+} lymphocytes"
## [3] "ER^{hi}CXCL12^{+}_to_Ep CD57^{+}"
## [4] "CK^{med}ER^{lo}_to_MHC I & II^{hi}"
## [5] "MHC I^{hi}CD57^{+}_to_Fibroblasts"
ggplot() +
geom_point(data = all_parameters_withoutOutliers, aes(x=shape, y=scale), alpha=0.1, size=0.5) +
geom_point(data = sample_points, aes(x=shape, y=scale, color=type), alpha=1, size=3) +
theme_bw() +
ylim(10,300) + xlim(0,6) +
scale_y_log10() +
xlab('Shape') + ylab('Scale')
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 262 rows containing missing values (`geom_point()`).
if (params$save_files){
ggsave(paste(here('output/Distance_results/Alberto_reproduction/'), 'extremeparameters.pdf', sep=''), width=6, height=4)
}
## Warning: Removed 262 rows containing missing values (`geom_point()`).
ggplot() +
stat_function(fun = dweibull, args = list(shape = bottom_right$shape, scale = bottom_right$scale), aes(colour='low scale, high shape')) +
stat_function(fun = dweibull, args = list(shape = bottom_left$shape, scale = bottom_left$scale),aes(colour='low scale, low shape')) +
stat_function(fun = dweibull, args = list(shape = mid_left$shape, scale = mid_left$scale), aes(colour='mid scale, low scale')) +
stat_function(fun = dweibull, args = list(shape = mid_right$shape, scale = mid_right$scale), aes(colour='mid scale, high scale')) +
# stat_function(fun = dweibull, args = list(shape = top_left$shape, scale = top_left$scale), aes(colour='high scale, low scale')) +
# stat_function(fun = dweibull, args = list(shape = top_right$shape, scale = top_right$scale), aes(colour='high scale, high shape')) +
stat_function(fun = dweibull, args = list(shape = belly$shape, scale = belly$scale),aes(colour='belly')) +
xlim(0,600) + theme_bw() + xlab('micron') + ylab('N')
if (params$save_files){
ggsave(paste(here('output/Distance_results/Alberto_reproduction/'), 'weibullcurves.pdf', sep=''),width=10, height=4)
}
sample_distance_data <- merge(x = all_distances_data %>% filter(tnumber %in% sample_points$tnumber) %>% filter(phenotype_combo %in%
sample_points$phenotype_combo),
y = sample_points %>% select(c(phenotype_combo, shape, scale, type)), all.X =T, by.x='phenotype_combo', by.y='phenotype_combo')
for (row in 1:nrow(sample_points)){
slide <- show_distance_distribution(sample_points[row,],0.2)
title <- paste(sample_points[row,]['type'][[1]], '(shape:', round(sample_points[row,]['shape'][[1]],2), 'scale:',round(sample_points[row,]['scale'][[1]],2), ')')
slideWithTitle <- addLabelToGrid(slide, title)
print(slideWithTitle)
if (params$save_files){
ggsave(here(paste('output/Distance_results/Alberto_reproduction/estimated_sample',row, '.pdf', sep='')),slideWithTitle, width=10, height=4)
}
}
Take a look at the outliers.
sample_outliers <- sample_n(outlier_points %>% filter(shape < 2),1)
for (i in 1:7){
outlier <- sample_n(outlier_points %>% filter(shape < sample_outliers[i,'shape']+0.55) %>% filter(shape > sample_outliers[i,'shape']+0.45),1)
sample_outliers <- rbind(sample_outliers, outlier)
}
ggplot() +
geom_point(data = all_parameters, aes(x=shape, y=scale), alpha=0.1, size=0.5) +
geom_point(data = sample_outliers, aes(x=shape, y=scale),colour='red', alpha=1, size=3) +
theme_bw() +
ylim(10,300) + xlim(0,6) +
scale_y_log10() +
xlab('Shape') + ylab('Scale')
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 271 rows containing missing values (`geom_point()`).
if (params$save_files){
ggsave(paste(here('output/Distance_results/outlier_sample_parameters'),row,'.pdf',sep=''), width=10, height=4)
}
## Warning: Removed 271 rows containing missing values (`geom_point()`).
sample_outliers_withDistanceDifference <- merge(sample_outliers, medians %>% select(c(unique_sample, difference,median_observations, median_weibull)), by='unique_sample', all.x=T)
show_distance_distribution_outliers <- function(sample, ylimit=0.02){
sample_distance_data <- merge(x = all_distances_data %>% filter(tnumber %in% sample$tnumber) %>% filter(phenotype_combo %in%
sample$phenotype_combo),
y = sample %>% select(c(phenotype_combo, shape, scale)), all.X =T, by.x='phenotype_combo', by.y='phenotype_combo')
median_obs <- sample['median_observations'][[1]]
median_wb <- sample['median_weibull'][[1]]
dist <- ggplot(data= sample_distance_data %>% filter(tnumber == sample['tnumber'][[1]]) %>% filter(phenotype_combo == sample['phenotype_combo'][[1]])) + geom_bar(aes(x=distance_window, y=N.per.mm2.scaled), stat="identity",alpha=0.5, color="#00BFC4") + geom_vline(xintercept = median_obs, color="#00BFC4", linetype = "longdash") + geom_vline(xintercept = median_wb, color='#F8766D', linetype = "longdash") +
stat_function(fun = dweibull, args = list(shape = sample['shape'][[1]], scale = sample['scale'][[1]]), color='#F8766D')+
xlim(0,600) + ylim(0,ylimit) + theme_bw() + theme(legend.position = "none") + ggtitle(label = sample['phenotype_combo'][[1]])
slide <- ggplot(data=cells%>% filter(ImageNumber == sample['tnumber'][[1]]) %>%
filter(meta_description == sample['phenotype_from'][[1]] | meta_description == sample['phenotype_to'][[1]])) +
geom_point(aes(x=Location_Center_X, y=Location_Center_Y, color=meta_description), alpha =0.5) +
theme_bw() + ggtitle(paste('imagenumber: ', sample['tnumber'][[1]]))
return(plot_grid(dist, slide,ncol=2))
}
for (row in 1:nrow(sample_outliers_withDistanceDifference)){
slide <- show_distance_distribution_outliers(sample_outliers_withDistanceDifference[row,],0.05)
title <- paste('Outlier with delta difference:' , round(sample_outliers_withDistanceDifference[row,]['difference'][[1]],2), '\n(shape:', round(sample_outliers_withDistanceDifference[row,]['shape'][[1]],2), 'scale:',round(sample_outliers_withDistanceDifference[row,]['scale'][[1]],2), ')')
slideWithTitle <- addLabelToGrid(slide, title)
print(slideWithTitle)
if (params$save_files){
ggsave(here(paste('output/Distance_results/outlier_sample',row, '.pdf', sep='')),slideWithTitle, width=10, height=4)
}
}
outlier_counts <- outlier_points %>% count(phenotype_combo) %>%
separate(phenotype_combo, into=c('phenotype_from','phenotype_to'),sep='_to_', remove = FALSE)
outlier_matrix <- reshape2::dcast(outlier_counts, phenotype_from~phenotype_to)
## Using n as value column: use value.var to override.
rownames(outlier_matrix) <- outlier_matrix$phenotype_from
outlier_matrix <- outlier_matrix %>% select(-c('phenotype_from')) %>% mutate_all( ~coalesce(.,0))
outlier_matrix <- (outlier_matrix / all_possibilities_matrix)*100
hm <- Heatmap(outlier_matrix,
name = '% outliers', column_title = 'phenotype_to', row_title = 'phenotype_from',
row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6),
cluster_rows = T, cluster_columns = T)
## Warning: The input is a data frame-like object, convert it to a matrix.
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
##
## Use `suppressMessages()` to turn off this message.
print(hm)
if (params$save_files){
save_pdf(hm, here('output/Distance_results/outliercounts.pdf'), width=10, height=6)
}
Take a better look at the belly points
belly_points <- sample_n(all_parameters_withoutOutliers %>% filter(shape < 1.1) %>% filter(shape > 1) %>% filter(scale > 60) %>% filter(scale <61),1)
for (i in 1:7){
belly <- sample_n(all_parameters_withoutOutliers %>% filter(shape < belly_points[i,'shape']+0.35) %>% filter(shape > belly_points[i,'shape']+0.25) %>% filter(scale > 60) %>% filter(scale <61),1)
belly_points <- rbind(belly_points, belly)
}
ggplot() +
geom_point(data = all_parameters_withoutOutliers, aes(x=shape, y=scale), alpha=0.1, size=0.5) +
geom_point(data = belly_points, aes(x=shape, y=scale),colour='red', alpha=1, size=3) +
theme_bw() +
ylim(10,300) + xlim(0,6) +
scale_y_log10() +
xlab('Shape') + ylab('Scale')
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 262 rows containing missing values (`geom_point()`).
ggsave(here('output/Distance_results/bellyline.pdf'), width=6, height=4)
## Warning: Removed 262 rows containing missing values (`geom_point()`).
belly_points <- belly_points %>% mutate(type = 'belly')
sample_distance_data <- merge(x = all_distances_data %>% filter(tnumber %in% belly_points$tnumber) %>% filter(phenotype_combo %in% belly_points$phenotype_combo),
y = belly_points %>% select(c(phenotype_combo, shape, scale, type)), all.X =T, by.x='phenotype_combo', by.y='phenotype_combo')
for (row in 1:nrow(belly_points)){
slide <- show_distance_distribution(belly_points[row,],0.025)
title <- paste(belly_points[row,]['type'][[1]], '(shape:', round(belly_points[row,]['shape'][[1]],2), 'scale:',round(belly_points[row,]['scale'][[1]],2), ')')
slideWithTitle <- addLabelToGrid(slide, title)
print(slideWithTitle)
ggsave(paste(here('output/Distance_results/bellyline_sample'),row,'.pdf',sep=''), width=10, height=4)
}
## Warning: Removed 6 rows containing missing values (`position_stack()`).
## Warning: Removed 1 rows containing missing values (`position_stack()`).
## Warning: Removed 3 rows containing missing values (`position_stack()`).
## Warning: Removed 3 rows containing missing values (`position_stack()`).
## Warning: Removed 5 rows containing missing values (`position_stack()`).
belly_selection <- all_parameters_withoutOutliers %>% filter(scale < 120) %>% filter(scale>25)
belowBelly_selection <- all_parameters_withoutOutliers %>% filter(scale<=25)
aboveBelly_selection <- all_parameters_withoutOutliers %>% filter(scale>=120)
plot_parameters <- function(df, cell_type, TME_types, Tumor_types){
point <- ggplot() +
geom_point(data=df %>% filter(phenotype_to != cell_type), aes(x=shape, y=scale,colour='all'),color='grey',alpha=0.1, size=0.5) +
geom_point(data=df %>% filter(phenotype_to == cell_type& phenotype_from %in% TME_types), aes(x=shape, y=scale,colour=paste('from', 'TME cell type', 'to', cell_type)),alpha=0.5, size=1) +
geom_point(data=df %>% filter(phenotype_to == cell_type & phenotype_from %in% Tumor_types), aes(x=shape, y=scale,colour=paste('from', 'tumour cell type', 'to', cell_type)),alpha=0.5, size=1) +
geom_point(data=df %>% filter(phenotype_to == cell_type & phenotype_from == cell_type), aes(x=shape, y=scale,colour=paste('from', cell_type, 'to', cell_type)),alpha=0.8, size=1) +
ylim(10,300) + xlim(0,6) +
theme_bw() + scale_y_log10() +
xlab('Shape') + ylab('Scale') +
theme(legend.position='bottom', legend.direction="vertical") + ggtitle(paste('From X to ', cell_type)) + guides(colour = guide_legend(override.aes = list(size=10))) +
labs(color=NULL)
return(point)
}
for ( t in getTumorAndTMETypes()$TME_types){
img <- plot_parameters(all_parameters,t,getTumorAndTMETypes()$TME_types,getTumorAndTMETypes()$tumor_types)
print(img)
ggsave(paste(here('output/Distance_results/Alberto_reproduction/parameters_to_'), gsub("[[:punct:]]", "", t),'.pdf',sep=''),height=8,width = 8)
}
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 269 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 269 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 264 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 264 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 268 rows containing missing values (`geom_point()`).
## Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 268 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 267 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 267 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 257 rows containing missing values (`geom_point()`).
## Warning: Removed 11 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 9 rows containing missing values (`geom_point()`).
## Warning: Removed 257 rows containing missing values (`geom_point()`).
## Warning: Removed 11 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 9 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 269 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 269 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 265 rows containing missing values (`geom_point()`).
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Warning: Removed 265 rows containing missing values (`geom_point()`).
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 245 rows containing missing values (`geom_point()`).
## Warning: Removed 14 rows containing missing values (`geom_point()`).
## Warning: Removed 12 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 245 rows containing missing values (`geom_point()`).
## Warning: Removed 14 rows containing missing values (`geom_point()`).
## Warning: Removed 12 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 271 rows containing missing values (`geom_point()`).
## Warning: Removed 271 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 271 rows containing missing values (`geom_point()`).
## Removed 271 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 271 rows containing missing values (`geom_point()`).
## Removed 271 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 257 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 10 rows containing missing values (`geom_point()`).
## Warning: Removed 257 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 10 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 269 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 269 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 247 rows containing missing values (`geom_point()`).
## Warning: Removed 19 rows containing missing values (`geom_point()`).
## Warning: Removed 5 rows containing missing values (`geom_point()`).
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Warning: Removed 247 rows containing missing values (`geom_point()`).
## Warning: Removed 19 rows containing missing values (`geom_point()`).
## Warning: Removed 5 rows containing missing values (`geom_point()`).
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 267 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 267 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 267 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 267 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
for ( t in getTumorAndTMETypes()$tumor_types){
img <- plot_parameters(all_parameters,t, getTumorAndTMETypes()$TME_types,getTumorAndTMETypes()$tumor_types)
print(img)
ggsave(paste(here('output/Distance_results/Alberto_reproduction/parameters_to_'), gsub("[[:punct:]]", "", t),'.pdf',sep=''),height=8,width = 8)
}
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 226 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 41 rows containing missing values (`geom_point()`).
## Warning: Removed 226 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 41 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 255 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 15 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 255 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 15 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 253 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 253 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 260 rows containing missing values (`geom_point()`).
## Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 8 rows containing missing values (`geom_point()`).
## Warning: Removed 260 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 8 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 260 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 7 rows containing missing values (`geom_point()`).
## Warning: Removed 260 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 7 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 270 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 270 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 264 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 264 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 268 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 268 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 270 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 270 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 271 rows containing missing values (`geom_point()`).
## Warning: Removed 271 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 264 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Warning: Removed 264 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 259 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 9 rows containing missing values (`geom_point()`).
## Warning: Removed 259 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 9 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 265 rows containing missing values (`geom_point()`).
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 265 rows containing missing values (`geom_point()`).
## Warning: Removed 6 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 269 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 269 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 258 rows containing missing values (`geom_point()`).
## Warning: Removed 13 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 258 rows containing missing values (`geom_point()`).
## Warning: Removed 13 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 265 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 5 rows containing missing values (`geom_point()`).
## Warning: Removed 265 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 5 rows containing missing values (`geom_point()`).
The belly is explained by tumor to TME cell relationships. Compare this
to the NABUCCO parameters.
Alberto_parameters <- read_tsv(here('DATA/global_weib_coeff_oct.tsv')) %>% mutate(type='NABUCCO')
## Rows: 1152 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): sample, phenotype_combo, translational_response, Biopsy, PA_Respons...
## dbl (5): a, b, A, B, ptID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Alberto_parameters <- Alberto_parameters %>% separate(phenotype_combo, c('phenotype_from', 'phenotype_to'), sep = '_to_') %>% rename(shape = a, scale = b)
Alberto_TME_types <- c("CD3+","CD68+","CD8+CD3+","CD20+","CD3+FoxP3+")
Alberto_Tumor_types <- c("PanCK+")
for ( t in Alberto_TME_types){
img <- plot_parameters(Alberto_parameters,t, Alberto_TME_types, Alberto_Tumor_types)
print(img)
# ggsave(paste(here('output/Distance_results/Alberto_reproduction/Albertoparameters_to_'), gsub("[[:punct:]]", "", t),'.pdf',sep=''),height=8,width = 8)
}
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 1 rows containing missing values (`geom_point()`).
for ( t in Alberto_Tumor_types){
img <- plot_parameters(Alberto_parameters,t, Alberto_TME_types, Alberto_Tumor_types)
print(img)
# ggsave(paste(here('output/Distance_results/Alberto_reproduction/Albertoparameters_to_'), gsub("[[:punct:]]", "", t),'.pdf',sep=''),height=8,width = 8)
}
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 1 rows containing missing values (`geom_point()`).
We see a similar distribution in Alberto’s data.
Now look at specific ‘X to celltype’ parameters similar to the paper by Gil-Jimenez et al.
plot_parameters <- function(combi){
point <- all_parameters %>%
filter(phenotype_to == combi) %>%
filter(phenotype_from %in% six_combinations) %>%
ggplot(aes(x=shape, y=scale, color=phenotype_from)) +
geom_point(alpha=0.4, size=1) +
ylim(10,300) + xlim(0,6) +
theme_bw() + scale_y_log10() +
xlab('Shape') + ylab('Scale') +
theme(legend.position='right') + ggtitle(paste('From X to ', combi)) + guides(colour = guide_legend(override.aes = list(size=10)))
print(point)
if (params$save_files){
ggsave(paste(here('output/Distance_results/Alberto_reproduction/'), gsub("[[:punct:]]", "", combi), '_parameters.pdf', sep=''))
}
}
six_combinations <- c('B cells','CD4^{+} T cells',
'Macrophages','CD8^{+} T cells','T_{Reg} & T_{Ex}')
for (c in six_combinations){
plot_parameters(c)
}
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Saving 7 x 5 in image
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Saving 7 x 5 in image
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Saving 7 x 5 in image
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Saving 7 x 5 in image
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Saving 7 x 5 in image
for (c in six_combinations){
plot_parameters(c)
grid <- plot_grid(ggplot(data = all_parameters %>%
filter(phenotype_to == c) %>%
filter(phenotype_from %in% six_combinations)) +
geom_boxplot(aes(x=phenotype_from, y = shape)) +
theme_bw() + ggtitle(paste('From X to ', c))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)),
ggplot(data = all_parameters %>%
filter(phenotype_to == c) %>%
filter(phenotype_from %in% six_combinations)) +
geom_boxplot(aes(x=phenotype_from, y = scale)) +
theme_bw() + ggtitle(paste('From X to ', c))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)))
print(grid)
ggsave(paste(here('output/Distance_results/Alberto_reproduction/'),'boxplots', gsub("[[:punct:]]", "", c), '.pdf', sep=''))
}
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Saving 7 x 5 in image
## Saving 7 x 5 in image
Compare the parameters of the six cell types of Alberto with our parameters. Note: We map the cell types of Alberto to our cell type classification, but both groups are not entirely the same and the cell type groups of Alberto contain more diverse cells.
Alberto_parameters <- read_tsv(here('DATA/global_weib_coeff_oct.tsv')) %>% mutate(type='NABUCCO')
## Rows: 1152 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): sample, phenotype_combo, translational_response, Biopsy, PA_Respons...
## dbl (5): a, b, A, B, ptID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NABUCCO_parameters <- Alberto_parameters %>%
separate(phenotype_combo, into=c('phenotype_from','phenotype_to'),sep='_to_', remove = FALSE) %>%
mutate(type = 'NABUCCO') %>%
mutate(METABRIC_type_from = ifelse(phenotype_from == "PanCK+", 'CK8-18^{hi}CXCL12^{hi}',
ifelse(phenotype_from=="CD20+", 'B cells',
ifelse(phenotype_from=='CD3+', 'CD4^{+} T cells',
ifelse(phenotype_from=="CD68+", 'Macrophages',
ifelse(phenotype_from=="CD8+CD3+",'CD8^{+} T cells',
ifelse(phenotype_from=="CD3+FoxP3+",'T_{Reg} & T_{Ex}', 'negative'))))))) %>%
mutate(METABRIC_type_to = ifelse(phenotype_to == "PanCK+", 'CK8-18^{hi}CXCL12^{hi}',
ifelse(phenotype_to=="CD20+", 'B cells',
ifelse(phenotype_to=='CD3+', 'CD4^{+} T cells',
ifelse(phenotype_to=="CD68+", 'Macrophages',
ifelse(phenotype_to=="CD8+CD3+",'CD8^{+} T cells',
ifelse(phenotype_to=="CD3+FoxP3+",'T_{Reg} & T_{Ex}', 'negative'))))))) %>%
select(c(a,b,METABRIC_type_to, METABRIC_type_from)) %>%
rename(shape = a,
scale = b,
phenotype_from = METABRIC_type_from,
phenotype_to = METABRIC_type_to) %>%
mutate(type='NABUCCO')
NABUCCO_AND_METABRIC_parameters <- bind_rows(all_parameters %>% select(c(phenotype_from, phenotype_to, shape, scale)) %>%
mutate(type='METABRIC'),
NABUCCO_parameters) %>% filter(phenotype_to %in% six_combinations) %>% filter(phenotype_from %in% six_combinations)
res <- wilcox.test(NABUCCO_AND_METABRIC_parameters %>% filter(phenotype_from==six_combinations[1] & phenotype_to==six_combinations[1] & type=='METABRIC') %>% pull(shape),
NABUCCO_AND_METABRIC_parameters %>% filter(phenotype_from==six_combinations[1] & phenotype_to==six_combinations[1] & type=='NABUCCO') %>% pull(shape))
res
##
## Wilcoxon rank sum test with continuity correction
##
## data: NABUCCO_AND_METABRIC_parameters %>% filter(phenotype_from == six_combinations[1] & phenotype_to == six_combinations[1] & type == "METABRIC") %>% pull(shape) and NABUCCO_AND_METABRIC_parameters %>% filter(phenotype_from == six_combinations[1] & phenotype_to == six_combinations[1] & type == "NABUCCO") %>% pull(shape)
## W = 1344, p-value = 0.7007
## alternative hypothesis: true location shift is not equal to 0
# https://rdrr.io/cran/ggsignif/man/stat_signif.html
for (c in six_combinations){
group_ordered_shape <- with(NABUCCO_AND_METABRIC_parameters %>% filter(type=='METABRIC') %>% filter(phenotype_to==c), # Order boxes by median
reorder(phenotype_from,
shape,
median))
data_ordered_shape <- NABUCCO_AND_METABRIC_parameters %>% filter(phenotype_to ==c)
data_ordered_shape$phenotype_from <- factor(data_ordered_shape$phenotype_from,
levels = levels(group_ordered_shape))
medians_shape <- data_ordered_shape %>% group_by(phenotype_from, type) %>% summarise(median = median(shape)) %>% arrange(factor(phenotype_from, levels = unique(group_ordered_shape)))
res_shape <- cor.test(medians_shape %>% filter(type=='METABRIC') %>% pull(median),medians_shape %>% filter(type=='NABUCCO') %>% pull(median), method='spearman')
group_ordered_scale <- with(NABUCCO_AND_METABRIC_parameters %>% filter(type=='METABRIC') %>% filter(phenotype_to==c), # Order boxes by median
reorder(phenotype_from,
scale,
median))
data_ordered_scale <- NABUCCO_AND_METABRIC_parameters %>% filter(phenotype_to ==c)
data_ordered_scale$phenotype_from <- factor(data_ordered_scale$phenotype_from,
levels = levels(group_ordered_scale))
medians_scale <- data_ordered_scale %>% group_by(phenotype_from, type) %>% summarise(median = median(scale)) %>% arrange(factor(phenotype_from, levels = unique(group_ordered_scale)))
res_scale <- cor.test(medians_scale %>% filter(type=='METABRIC') %>% pull(median),medians_scale %>% filter(type=='NABUCCO') %>% pull(median),method='spearman')
bp <- plot_grid(ggplot(data=data_ordered_shape %>%
filter(phenotype_to == c) %>%
filter(phenotype_from %in% six_combinations)) +
geom_boxplot(aes(x=phenotype_from, y=shape, fill=type)) +
theme_bw() + ggtitle(paste('From X to', c), subtitle= paste('rho:', round(res_shape$estimate[[1]],4)))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + theme(legend.position = "none"),
# + stat_compare_means(aes(x=phenotype_from, y=shape, group = type), label = "p.format",size=2)
ggplot(data=data_ordered_scale%>%
filter(phenotype_to == c) %>%
filter(phenotype_from %in% six_combinations)) +
geom_boxplot(aes(x=phenotype_from, y=scale, fill=type)) +
theme_bw() + ggtitle(paste('From X to ', c), subtitle= paste('rho:', round(res_scale$estimate[[1]],4)))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# + stat_compare_means(aes(x=phenotype_from, y=scale, group = type), label = "p.format",size=2)
,rel_widths = c(1.35,2))
print(bp)
if (params$save_files){
ggsave(paste(here('output/Distance_results/Alberto_reproduction/'),'compare_boxplots',gsub("[[:punct:]]", "", c),'.pdf', sep=''))
}
}
## `summarise()` has grouped output by 'phenotype_from'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'phenotype_from'. You can override using
## the `.groups` argument.
## Saving 7 x 5 in image
## `summarise()` has grouped output by 'phenotype_from'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'phenotype_from'. You can override using
## the `.groups` argument.
## Saving 7 x 5 in image
## `summarise()` has grouped output by 'phenotype_from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'phenotype_from'. You can override using the `.groups` argument.
## Saving 7 x 5 in image
## `summarise()` has grouped output by 'phenotype_from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'phenotype_from'. You can override using the `.groups` argument.
## Saving 7 x 5 in image
## `summarise()` has grouped output by 'phenotype_from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'phenotype_from'. You can override using the `.groups` argument.
## Saving 7 x 5 in image
saveRDS(all_parameters_withoutOutliers, file = here('scratch/all_parameters_withoutOutliers.rds'))
transform_parameters_to_matrix(outliers = FALSE)
shape_matrix <- getShapeFeatures()
sum(!is.na(shape_matrix[,2:ncol(shape_matrix)])) == nrow(all_parameters_withoutOutliers)
## [1] TRUE